library(readr)
df <- read_csv("~/df.csv")

Research idea

First of all, I would like to introduce my own research question on which I will be answering in this project:

“I want to find a University where I will receive a large salary after graduation”. Another limitation is my desire to study either political science or sociology.

I will focus on choosing variables first and EDA then.

Data preparation

So, this is mostly market-oriented project, therefore I want only income variable and score of teaching (we still want to study in the best uni on the planet!):

#colnames(df)

library(dplyr)
df = df %>% 
  select(-scores_research, -scores_research_rank, -scores_citations, -scores_citations_rank, -stats_pc_intl_students, -scores_international_outlook, -scores_overall)

Deleting also some half-metadata info and ranks (prefer working with scores), also deleted location (applicants from all over the world and ready to move):

df = df %>% 
  select(-rank_order, -rank, -record_type, -url, -member_level, -aliases, -closed, -apply_link, -ref_link, -scores_overall_rank, -scores_teaching_rank, -scores_industry_income_rank, -scores_international_outlook_rank, -stats_female_male_ratio, -location, -stats_female_share, -stats_number_students, -stats_student_staff_ratio)
#df$scores_overall = strtrim(df$scores_overall, 4) #get rid of "50.6–54.2"
#df$scores_overall = as.numeric(df$scores_overall)

I will also create dummies for subject in order to get some interesting clusters:

library(dplyr)
library(tidyr)

df_subj = df %>% 
  mutate(subjects_offered = strsplit(as.character(subjects_offered), ",")) %>%
  unnest(subjects_offered)
df_subj$subjects_offered = trimws(df_subj$subjects_offered)

library(fastDummies)
data <- dummy_cols(df_subj, select_columns = 'subjects_offered')
data = data %>% select(-subjects_offered)


data[,c(10:45)] <- lapply(data[,c(4:39)] , factor)
#data[, c(10:45)] <- as.factor(data[, c(10:45)])

EDA

library(DataExplorer)
plot_intro(data)

These are our dummy variables in dataset:

plot_bar(data)

Correlations first:

cor_num = data[, c(2:3)]

require(heatmaply)
heatmaply(
  cor(normalize(cor_num)),
  xlab = "Features",
  ylab = "Cars"
)

Overall, correlations looks good for now.

#Standardizing numeric variables:
#data[,c(2:5, 7:9)] <- lapply(data[,c(2:5, 7:9)] , scale)

Here are our numeric variables:

plot_histogram(data)

Distance metric

#categories = data %>% select(nid, name)
data <- data[-c(41:45)]
#data = distinct(data)
data = data %>% select(-nid)
data = data[,c(1:3, 35,37)]
data = distinct(data)



library(cluster)
gower_dist <- daisy(data[ , -1],
                    metric = "gower",
                    type = list(logratio = 3)) 

summary(gower_dist)
## 6739956 dissimilarities, summarized :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1402  0.2975  0.3104  0.4487  0.9782 
## Metric :  mixed ;  Types = I, I, N, N 
## Number of objects : 3672

Fine, types by daisy are true: I have 2 factor variables in the end and 2 numerical.

how many clusters must be here

sil_width <- c(NA)

for(i in 2:10){
  pam_fit <- pam(gower_dist,
                 diss = TRUE,
                 k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:10, sil_width,
     xlab = "Number of clusters", xaxt='n',
     ylab = "Silhouette Width",
     ylim = c(0,1))
axis(1, at = seq(2, 10, by = 1), las=2)
lines(1:10, sil_width)

The highest value is 3, so we must choose this number of clusters.

#PAM clustering

pam_fit <- pam(gower_dist, diss = TRUE, k = 3)
pam_fit$data <- data[ , -1]
pam_results <- data %>%
  dplyr::select(-name) %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
pam_results$the_summary
## [[1]]
##  scores_teaching scores_industry_income
##  Min.   :11.70   Min.   : 33.30        
##  1st Qu.:18.60   1st Qu.: 34.80        
##  Median :23.55   Median : 38.50        
##  Mean   :27.81   Mean   : 46.09        
##  3rd Qu.:32.50   3rd Qu.: 50.00        
##  Max.   :94.40   Max.   :100.00        
##  subjects_offered_Politics & International Studies (incl Development Studies)
##  0:1526                                                                      
##  1:   0                                                                      
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  subjects_offered_Sociology    cluster 
##  0:1526                     Min.   :1  
##  1:   0                     1st Qu.:1  
##                             Median :1  
##                             Mean   :1  
##                             3rd Qu.:1  
##                             Max.   :1  
## 
## [[2]]
##  scores_teaching scores_industry_income
##  Min.   :11.70   Min.   : 33.30        
##  1st Qu.:18.90   1st Qu.: 34.60        
##  Median :24.40   Median : 38.00        
##  Mean   :28.86   Mean   : 45.19        
##  3rd Qu.:34.00   3rd Qu.: 46.70        
##  Max.   :94.40   Max.   :100.00        
##  subjects_offered_Politics & International Studies (incl Development Studies)
##  0:1051                                                                      
##  1:   0                                                                      
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  subjects_offered_Sociology    cluster 
##  0:   0                     Min.   :2  
##  1:1051                     1st Qu.:2  
##                             Median :2  
##                             Mean   :2  
##                             3rd Qu.:2  
##                             Max.   :2  
## 
## [[3]]
##  scores_teaching scores_industry_income
##  Min.   :11.70   Min.   : 33.30        
##  1st Qu.:18.50   1st Qu.: 35.20        
##  Median :23.20   Median : 39.40        
##  Mean   :27.88   Mean   : 47.28        
##  3rd Qu.:32.00   3rd Qu.: 51.70        
##  Max.   :94.40   Max.   :100.00        
##  subjects_offered_Politics & International Studies (incl Development Studies)
##  0:   0                                                                      
##  1:1095                                                                      
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  subjects_offered_Sociology    cluster 
##  0:1095                     Min.   :3  
##  1:   0                     1st Qu.:3  
##                             Median :3  
##                             Mean   :3  
##                             3rd Qu.:3  
##                             Max.   :3

Cluster description on PAM method:

From our customer’s point of view the better option is political sciences and higher income (cluster 3), and best choices will be:

data[pam_fit$medoids, ]
## # A tibble: 3 x 5
##   name    scores_teaching scores_industry_… `subjects_offered_… subjects_offere…
##   <chr>             <dbl>             <dbl> <fct>               <fct>           
## 1 Univer…            23.4              38.4 0                   0               
## 2 Niigat…            23.9              38.5 0                   1               
## 3 Univer…            23.5              39.5 1                   0
set.seed(99)
library(Rtsne)
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)

tsne_data <- tsne_obj$Y %>% # $Y is a matrix containing the new representations for the objects
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering),
         name = data$name)
library(ggplot2)
ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster))

Exactly three clusters on the plot!